## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpT5ZBxS/h2o_krishnaprasad_started_from_r.out
##     /var/folders/qw/2tnkb3b11dncn1d6lmqs7rh40000gn/T//RtmpT5ZBxS/h2o_krishnaprasad_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 seconds 864 milliseconds 
##     H2O cluster timezone:       America/Denver 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    1 month and 6 days  
##     H2O cluster name:           H2O_started_from_R_krishnaprasad_aty884 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.00 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.1 (2019-07-05)
# Read classification dataset from RDS
class.data <- readRDS("KDEN_Class_Data_New.RDS")

# Reorder data frame by date so time-ordered operations downstream are valid
class.data <- class.data[order(class.data$DATE), ]

# class.data$PRCP_LAG_1 <- lag(class.data$PRCP, k = 1)

# class.data$TEMP_LAG_1 <- lag(class.data$TEMP, k = 1)

# Drop rows with any missing values
class.data <- class.data[complete.cases(class.data), ]

# Remove sentinel codes the weather source uses for "missing":
# 999.9 for max wind speed (MXSPD), 99.99 for precipitation (PRCP)
class.data <-
  class.data[!(class.data$MXSPD == 999.9 |
                 class.data$PRCP == 99.99), ] #| class.data$PRCP_LAG_1 == 99.99

class.data$FOG <- as.factor(class.data$FOG)
class.data$SNOW_ICE <- as.factor(class.data$SNOW_ICE)

# Derive week/year and the strikes-per-10k-flights ratio.
# Inside mutate() refer to columns directly (DATE) rather than
# class.data$DATE: the latter bypasses the pipeline's data masking and
# silently reads from the pre-pipe object.
class.data <- class.data %>%
  mutate(
    WEEK = lubridate::week(DATE),
    YEAR = lubridate::year(DATE)
  ) %>%
  mutate(RATIO = STRIKECOUNT / FLIGHTCOUNT * 10000)

# Weekly risk codes: average the strike ratio per (YEAR, WEEK) and bin
# the weekly means into terciles (0 = lowest third, 2 = highest third).
#
# NOTE(review): after summarise(), the result is still grouped by YEAR
# (summarise drops only the last grouping level), so quantile() inside
# the following mutate() computes tercile breaks *within each year*,
# not across all years. Confirm this per-year binning is intended.
t.data <- class.data %>%
  mutate(RATIO = STRIKECOUNT / FLIGHTCOUNT * 10000) %>%
  group_by(YEAR, WEEK) %>%
  summarise(RATIO = mean(RATIO)) %>%
  mutate(RISK = .bincode(
    RATIO,
    breaks = quantile(RATIO, probs = seq(0, 1, 1 / 3)),
    include.lowest = TRUE
  ) - 1) %>%
  dplyr::select(-RATIO)

# Attach the weekly RISK code back onto every daily row of its week
class.data <-
  left_join(class.data, t.data, by = c("YEAR" = "YEAR", "WEEK" = "WEEK"))

# Recode 0/1/2 -> "L"/"M"/"H" risk labels as a factor
class.data$RISK <-
  as.factor(ifelse(class.data$RISK == 0, "L", ifelse(class.data$RISK == 1, "M", "H")))
# One-hot-encode categorical features
ohe_feats <- c("MONTH")

# Create dummies (caret::dummyVars builds the encoding model from MONTH)
dummies <- dummyVars(~ MONTH, data = class.data)

df.dummies <- as.data.frame(predict(dummies, newdata = class.data))

# Replace the original MONTH column with its dummy columns
class.data <-
  cbind(class.data[, -c(which(colnames(class.data) %in% ohe_feats))], df.dummies)

# Hold out 2019 as the validation set
valid.cl.data <-
  class.data[(class.data$YEAR == 2019), ]

# Train/test pool: drop early years (1995-2007) and the 2019 hold-out.
# Refer to YEAR directly inside filter(); class.data$YEAR happens to work
# here but bypasses dplyr's data masking and breaks if earlier pipe steps
# ever change the row set.
class.data <-
  class.data %>% filter(!YEAR %in% c(1995:2007, 2019))

# Drop identifiers, target-derived columns (RISK was built from RATIO,
# which comes from STRIKECOUNT), and MONTH.12 (redundant: the 12 month
# dummies sum to one)
class.data <- subset(class.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, STRIKECOUNT, STRIKE, WEEK, RATIO, MONTH.12))

valid.cl.data <- subset(valid.cl.data, select = -c(DATE, YEAR, SEASON, MXSPD, SNOW_ICE, STRIKECOUNT, STRIKE, WEEK, RATIO, MONTH.12))
# Create the training and test datasets (seed fixed for reproducible split)
set.seed(100)

# Ensure the response is a factor (no-op if it was already factored above)
class.data$RISK <- as.factor(class.data$RISK)

# Step 1: Get row numbers for the training data (70%, stratified on RISK)
trainRowNumbers.cl <-
  createDataPartition(class.data$RISK, p = 0.70, list = FALSE)

# Step 2: Create the training  dataset
train.data <- class.data[trainRowNumbers.cl, ]

# Step 3: Create the test dataset (complement of the training rows)
test.data <- class.data[-trainRowNumbers.cl, ]
# Print a trained caret model's resampling summary, its MLeval plots
# (ROC, calibration, precision-recall-gain), a preview of its
# predictions on `data`, and return the confusion matrix.
#
# Args:
#   model: a caret `train` object (fitted with classProbs = TRUE so
#          MLeval can use the saved CV predictions)
#   data:  a data frame containing the predictors plus a RISK column
#
# Returns: the caret::confusionMatrix object (last expression).
validateAndPrintResult <- function(model, data) {
  # Summarise resampling results
  print(model)
  
  ## run MLeval on the model's saved cross-validation predictions
  res <- evalm(model)
  
  ## Bare expressions do not auto-print inside a function body, so each
  ## plot must be printed explicitly or it is silently discarded.
  print(res$roc)   # ROC curve
  print(res$cc)    # calibration curve
  print(res$prg)   # precision recall gain curve
  
  # Predict on the supplied data and show the first few predictions
  predicted.resp <- predict(model, data)
  print(head(predicted.resp))
  
  # `positive` is only meaningful for two-class problems; RISK has three
  # levels (H/L/M) and 'YES' is not one of them, so it is omitted here.
  caret::confusionMatrix(
    reference = as.factor(data$RISK),
    data = predicted.resp,
    mode = 'everything'
  )
}
# Shared resampling setup for all models: 7-fold cross-validation.
# savePredictions = "final" keeps hold-out predictions for MLeval;
# classProbs = TRUE is required for logLoss/AUC via multiClassSummary.
trControl <- trainControl(
  method = "cv",
  number = 7,
  savePredictions = "final",
  # index = createResample(as.factor(train.data$RISK), 7),
  classProbs = TRUE,
  summaryFunction = multiClassSummary
)

# Tuning grids ----

# Multinomial regression: no weight decay (single candidate)
multinom.grid <- expand.grid(decay = 0)

# xgbTree: tune tree depth only (7 candidates), everything else fixed
xgbTreeGrid <-
  expand.grid(
    nrounds = 500,
    max_depth = seq(2, 8, by = 1),
    eta = 0.1,
    gamma = 0,
    colsample_bytree = 1.0,
    subsample = 1.0,
    min_child_weight = 4
  )

# glmnet elastic net: single alpha/lambda point. Plain column names
# (no leading dot) are the current caret tuneGrid convention and match
# the other grids in this file; caret strips the deprecated dotted form,
# so behavior is unchanged.
glmnetGridElastic <-
  expand.grid(alpha = 0.3, lambda = 0.009)

# gbm: 3 x 3 x 3 = 27 candidate combinations at a fixed tree count
gbm.tune.grid <-
  expand.grid(
    n.trees = c(400),
    interaction.depth = c(1, 3, 5),
    shrinkage = c(.01, .1, .3),
    n.minobsinnode = c(5, 10, 15)
  )


# Fixed seed so all five models see the same resampling indexes
set.seed(333)

# Train five classifiers with identical CV folds via caretEnsemble;
# logLoss is the selection metric for every model.
modelList <- caretList(
  RISK ~ .,
  
  train.data,
  
  trControl = trControl,
  
  metric = "logLoss",
  
  verbose = TRUE,
  
  tuneList = list(
    
    # Multinomial Logistic regression is using multinom method from nnet package
    multinom = caretModelSpec(method = 'multinom',
                              maxit = 150,
                              tuneGrid = multinom.grid), 
    
    ## Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
    
    xgbTree = caretModelSpec(
      method = "xgbTree",
      tuneGrid = xgbTreeGrid,
      nthread = 8
    ),

    glmnet = caretModelSpec(method = "glmnet", tuneGrid = glmnetGridElastic),
    # Elastic, highly correlated with lasso and ridge regressions

    # NOTE(review): tuneGrid takes precedence over tuneLength in caret,
    # so tuneLength = 20 is presumably ignored here (mtry fixed at 10) —
    # confirm and consider removing one of the two.
    rf = caretModelSpec(
      method = "rf",
      ntree = 2000,
      tuneLength = 20,
      tuneGrid = data.frame(mtry = 10)
    ),
    # rf

    gbm = caretModelSpec(method = "gbm", tuneGrid = gbm.tune.grid)
    
  )
)
## Warning in trControlCheck(x = trControl, y = target): indexes not defined in
## trControl. Attempting to set them ourselves, so each model in the ensemble will
## have the same resampling indexes.
## # weights:  63 (40 variable)
## initial  value 3091.494980 
## iter  10 value 2610.059596
## iter  20 value 2240.432174
## iter  30 value 2120.343168
## iter  40 value 2102.297073
## iter  50 value 2101.755794
## final  value 2101.755759 
## converged
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        1.0986             nan     0.0100    0.0154
##      2        1.0902             nan     0.0100    0.0149
##      3        1.0822             nan     0.0100    0.0152
##      4        1.0739             nan     0.0100    0.0144
##      5        1.0660             nan     0.0100    0.0136
##      6        1.0586             nan     0.0100    0.0130
##      7        1.0514             nan     0.0100    0.0131
##      8        1.0443             nan     0.0100    0.0122
##      9        1.0374             nan     0.0100    0.0118
##     10        1.0307             nan     0.0100    0.0119
##     20        0.9728             nan     0.0100    0.0092
##     40        0.8949             nan     0.0100    0.0049
##     60        0.8465             nan     0.0100    0.0031
##     80        0.8146             nan     0.0100    0.0019
##    100        0.7926             nan     0.0100    0.0009
##    120        0.7762             nan     0.0100    0.0005
##    140        0.7634             nan     0.0100    0.0005
##    160        0.7529             nan     0.0100    0.0005
##    180        0.7436             nan     0.0100    0.0002
##    200        0.7360             nan     0.0100   -0.0000
##    220        0.7287             nan     0.0100    0.0001
##    240        0.7221             nan     0.0100    0.0001
##    260        0.7158             nan     0.0100    0.0001
##    280        0.7106             nan     0.0100   -0.0001
##    300        0.7054             nan     0.0100   -0.0001
##    320        0.7007             nan     0.0100   -0.0001
##    340        0.6962             nan     0.0100   -0.0001
##    360        0.6919             nan     0.0100   -0.0003
##    380        0.6875             nan     0.0100   -0.0001
##    400        0.6832             nan     0.0100   -0.0001
# Evaluate the multinomial model on the held-out test split
validateAndPrintResult(modelList$multinom, test.data)
## Penalized Multinomial Regression 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC      Accuracy   Kappa      Mean_F1  
##   0.7664346  0.8053847  0.6387627  0.6400277  0.4590557  0.6267216
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6350695         0.8204098         0.627725             0.8257711          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.627725        0.6350695    0.2133426            0.7277396             
## 
## Tuning parameter 'decay' was held constant at a value of 0
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.578003211449456
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   H   L   M
##          H 310  10  81
##          L  22 317 151
##          M  81  78 153
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6484          
##                  95% CI : (0.6206, 0.6754)
##     No Information Rate : 0.3433          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4716          
##                                           
##  Mcnemar's Test P-Value : 4.058e-06       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7506   0.7827   0.3974
## Specificity            0.8848   0.7832   0.8056
## Pos Pred Value         0.7731   0.6469   0.4904
## Neg Pred Value         0.8716   0.8766   0.7396
## Precision              0.7731   0.6469   0.4904
## Recall                 0.7506   0.7827   0.3974
## F1                     0.7617   0.7084   0.4390
## Prevalence             0.3433   0.3367   0.3200
## Detection Rate         0.2577   0.2635   0.1272
## Detection Prevalence   0.3333   0.4073   0.2594
## Balanced Accuracy      0.8177   0.7830   0.6015
# Evaluate the multinomial model on the 2019 validation set
validateAndPrintResult(modelList$multinom, valid.cl.data)
## Penalized Multinomial Regression 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC      Accuracy   Kappa      Mean_F1  
##   0.7664346  0.8053847  0.6387627  0.6400277  0.4590557  0.6267216
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6350695         0.8204098         0.627725             0.8257711          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.627725        0.6350695    0.2133426            0.7277396             
## 
## Tuning parameter 'decay' was held constant at a value of 0
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.578003211449456
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  H  L  M
##          H 79 21 21
##          L  0 48 35
##          M 25 32 42
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5578          
##                  95% CI : (0.4999, 0.6145)
##     No Information Rate : 0.3432          
##     P-Value [Acc > NIR] : 1.869e-14       
##                                           
##                   Kappa : 0.3359          
##                                           
##  Mcnemar's Test P-Value : 8.359e-05       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7596   0.4752   0.4286
## Specificity            0.7889   0.8267   0.7220
## Pos Pred Value         0.6529   0.5783   0.4242
## Neg Pred Value         0.8626   0.7591   0.7255
## Precision              0.6529   0.5783   0.4242
## Recall                 0.7596   0.4752   0.4286
## F1                     0.7022   0.5217   0.4264
## Prevalence             0.3432   0.3333   0.3234
## Detection Rate         0.2607   0.1584   0.1386
## Detection Prevalence   0.3993   0.2739   0.3267
## Balanced Accuracy      0.7743   0.6510   0.5753
# Evaluate the xgbTree model on the held-out test split
validateAndPrintResult(modelList$xgbTree, test.data)
## eXtreme Gradient Boosting 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  logLoss    AUC        prAUC      Accuracy   Kappa      Mean_F1  
##   2          0.7735971  0.8083205  0.6459289  0.6336302  0.4492462  0.6193716
##   3          0.8012811  0.8046506  0.6458706  0.6265220  0.4388260  0.6149632
##   4          0.8368296  0.7993913  0.6425343  0.6229674  0.4337348  0.6151014
##   5          0.8741677  0.7973666  0.6402046  0.6155065  0.4225058  0.6075197
##   6          0.9120783  0.7959673  0.6368211  0.6066276  0.4091690  0.5986975
##   7          0.9360382  0.7962670  0.6378808  0.6105331  0.4151823  0.6038741
##   8          0.9674818  0.7951167  0.6373674  0.6084001  0.4118915  0.6010594
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6287187         0.8170270         0.6210115            0.8230597          
##   0.6219906         0.8136201         0.6153662            0.8179100          
##   0.6190170         0.8119166         0.6154902            0.8144229          
##   0.6113308         0.8082161         0.6069299            0.8105530          
##   0.6024505         0.8037839         0.5976849            0.8059701          
##   0.6065675         0.8058002         0.6036407            0.8074718          
##   0.6043163         0.8046929         0.6004833            0.8066484          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6210115       0.6287187    0.2112101            0.7228728             
##   0.6153662       0.6219906    0.2088407            0.7178054             
##   0.6154902       0.6190170    0.2076558            0.7154668             
##   0.6069299       0.6113308    0.2051688            0.7097735             
##   0.5976849       0.6024505    0.2022092            0.7031172             
##   0.6036407       0.6065675    0.2035110            0.7061838             
##   0.6004833       0.6043163    0.2028000            0.7045046             
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 4
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
##  = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.579887209008348
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   H   L   M
##          H 295  16  69
##          L  33 313 160
##          M  85  76 156
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6351          
##                  95% CI : (0.6072, 0.6623)
##     No Information Rate : 0.3433          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4518          
##                                           
##  Mcnemar's Test P-Value : 3.68e-08        
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7143   0.7728   0.4052
## Specificity            0.8924   0.7581   0.8032
## Pos Pred Value         0.7763   0.6186   0.4921
## Neg Pred Value         0.8566   0.8680   0.7415
## Precision              0.7763   0.6186   0.4921
## Recall                 0.7143   0.7728   0.4052
## F1                     0.7440   0.6872   0.4444
## Prevalence             0.3433   0.3367   0.3200
## Detection Rate         0.2452   0.2602   0.1297
## Detection Prevalence   0.3159   0.4206   0.2635
## Balanced Accuracy      0.8033   0.7655   0.6042
# Evaluate the xgbTree model on the 2019 validation set
validateAndPrintResult(modelList$xgbTree, valid.cl.data)
## eXtreme Gradient Boosting 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results across tuning parameters:
## 
##   max_depth  logLoss    AUC        prAUC      Accuracy   Kappa      Mean_F1  
##   2          0.7735971  0.8083205  0.6459289  0.6336302  0.4492462  0.6193716
##   3          0.8012811  0.8046506  0.6458706  0.6265220  0.4388260  0.6149632
##   4          0.8368296  0.7993913  0.6425343  0.6229674  0.4337348  0.6151014
##   5          0.8741677  0.7973666  0.6402046  0.6155065  0.4225058  0.6075197
##   6          0.9120783  0.7959673  0.6368211  0.6066276  0.4091690  0.5986975
##   7          0.9360382  0.7962670  0.6378808  0.6105331  0.4151823  0.6038741
##   8          0.9674818  0.7951167  0.6373674  0.6084001  0.4118915  0.6010594
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6287187         0.8170270         0.6210115            0.8230597          
##   0.6219906         0.8136201         0.6153662            0.8179100          
##   0.6190170         0.8119166         0.6154902            0.8144229          
##   0.6113308         0.8082161         0.6069299            0.8105530          
##   0.6024505         0.8037839         0.5976849            0.8059701          
##   0.6065675         0.8058002         0.6036407            0.8074718          
##   0.6043163         0.8046929         0.6004833            0.8066484          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6210115       0.6287187    0.2112101            0.7228728             
##   0.6153662       0.6219906    0.2088407            0.7178054             
##   0.6154902       0.6190170    0.2076558            0.7154668             
##   0.6069299       0.6113308    0.2051688            0.7097735             
##   0.5976849       0.6024505    0.2022092            0.7031172             
##   0.6036407       0.6065675    0.2035110            0.7061838             
##   0.6004833       0.6043163    0.2028000            0.7045046             
## 
## Tuning parameter 'nrounds' was held constant at a value of 500
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 4
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 500, max_depth = 2, eta
##  = 0.1, gamma = 0, colsample_bytree = 1, min_child_weight = 4 and subsample = 1.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.579887209008348
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  H  L  M
##          H 99 39 24
##          L  0 45 35
##          M  5 17 39
## 
## Overall Statistics
##                                           
##                Accuracy : 0.604           
##                  95% CI : (0.5464, 0.6594)
##     No Information Rate : 0.3432          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.403           
##                                           
##  Mcnemar's Test P-Value : 1.841e-12       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.9519   0.4455   0.3980
## Specificity            0.6834   0.8267   0.8927
## Pos Pred Value         0.6111   0.5625   0.6393
## Neg Pred Value         0.9645   0.7489   0.7562
## Precision              0.6111   0.5625   0.6393
## Recall                 0.9519   0.4455   0.3980
## F1                     0.7444   0.4972   0.4906
## Prevalence             0.3432   0.3333   0.3234
## Detection Rate         0.3267   0.1485   0.1287
## Detection Prevalence   0.5347   0.2640   0.2013
## Balanced Accuracy      0.8177   0.6361   0.6453
# Evaluate the glmnet (elastic net) model on the held-out test split
validateAndPrintResult(modelList$glmnet, test.data)
## glmnet 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC     Accuracy   Kappa      Mean_F1  
##   0.7679149  0.8061893  0.636981  0.6410982  0.4607653  0.6292992
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6363986         0.8209763         0.6299618            0.8256112          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6299618       0.6363986    0.2136994            0.7286875             
## 
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
##  parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.585457190383298
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   H   L   M
##          H 305  11  77
##          L  23 314 147
##          M  85  80 161
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6484          
##                  95% CI : (0.6206, 0.6754)
##     No Information Rate : 0.3433          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4718          
##                                           
##  Mcnemar's Test P-Value : 2.055e-05       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7385   0.7753   0.4182
## Specificity            0.8886   0.7870   0.7983
## Pos Pred Value         0.7761   0.6488   0.4939
## Neg Pred Value         0.8667   0.8734   0.7446
## Precision              0.7761   0.6488   0.4939
## Recall                 0.7385   0.7753   0.4182
## F1                     0.7568   0.7064   0.4529
## Prevalence             0.3433   0.3367   0.3200
## Detection Rate         0.2535   0.2610   0.1338
## Detection Prevalence   0.3267   0.4023   0.2710
## Balanced Accuracy      0.8136   0.7811   0.6082
# Evaluate the glmnet (elastic net) model on the 2019 validation set
validateAndPrintResult(modelList$glmnet, valid.cl.data)
## glmnet 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC     Accuracy   Kappa      Mean_F1  
##   0.7679149  0.8061893  0.636981  0.6410982  0.4607653  0.6292992
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6363986         0.8209763         0.6299618            0.8256112          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6299618       0.6363986    0.2136994            0.7286875             
## 
## Tuning parameter 'alpha' was held constant at a value of 0.3
## Tuning
##  parameter 'lambda' was held constant at a value of 0.009
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.585457190383298
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  H  L  M
##          H 80 21 21
##          L  0 52 31
##          M 24 28 46
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5875          
##                  95% CI : (0.5297, 0.6434)
##     No Information Rate : 0.3432          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3805          
##                                           
##  Mcnemar's Test P-Value : 8.894e-05       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7692   0.5149   0.4694
## Specificity            0.7889   0.8465   0.7463
## Pos Pred Value         0.6557   0.6265   0.4694
## Neg Pred Value         0.8674   0.7773   0.7463
## Precision              0.6557   0.6265   0.4694
## Recall                 0.7692   0.5149   0.4694
## F1                     0.7080   0.5652   0.4694
## Prevalence             0.3432   0.3333   0.3234
## Detection Rate         0.2640   0.1716   0.1518
## Detection Prevalence   0.4026   0.2739   0.3234
## Balanced Accuracy      0.7791   0.6807   0.6079
# Evaluate the random forest model on the held-out test split
validateAndPrintResult(modelList$rf, test.data)
## Random Forest 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC      Accuracy  Kappa      Mean_F1  
##   0.7764229  0.8052583  0.6443655  0.637183  0.4548583  0.6261077
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6327145         0.8189463         0.6273004            0.8232955          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6273004       0.6327145    0.2123943            0.7258304             
## 
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.566664724789867
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   H   L   M
##          H 297  17  79
##          L  31 316 158
##          M  85  72 148
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6326          
##                  95% CI : (0.6046, 0.6599)
##     No Information Rate : 0.3433          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4478          
##                                           
##  Mcnemar's Test P-Value : 5.988e-08       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7191   0.7802   0.3844
## Specificity            0.8785   0.7632   0.8081
## Pos Pred Value         0.7557   0.6257   0.4852
## Neg Pred Value         0.8568   0.8725   0.7361
## Precision              0.7557   0.6257   0.4852
## Recall                 0.7191   0.7802   0.3844
## F1                     0.7370   0.6945   0.4290
## Prevalence             0.3433   0.3367   0.3200
## Detection Rate         0.2469   0.2627   0.1230
## Detection Prevalence   0.3267   0.4198   0.2535
## Balanced Accuracy      0.7988   0.7717   0.5962
# Evaluate the random forest model on the 2019 validation set
validateAndPrintResult(modelList$rf, valid.cl.data)
## Random Forest 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results:
## 
##   logLoss    AUC        prAUC      Accuracy  Kappa      Mean_F1  
##   0.7764229  0.8052583  0.6443655  0.637183  0.4548583  0.6261077
##   Mean_Sensitivity  Mean_Specificity  Mean_Pos_Pred_Value  Mean_Neg_Pred_Value
##   0.6327145         0.8189463         0.6273004            0.8232955          
##   Mean_Precision  Mean_Recall  Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.6273004       0.6327145    0.2123943            0.7258304             
## 
## Tuning parameter 'mtry' was held constant at a value of 10
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.566664724789867
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  H  L  M
##          H 88 25 21
##          L  0 56 33
##          M 16 20 44
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6205          
##                  95% CI : (0.5632, 0.6753)
##     No Information Rate : 0.3432          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4292          
##                                           
##  Mcnemar's Test P-Value : 2.391e-06       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.8462   0.5545   0.4490
## Specificity            0.7688   0.8366   0.8244
## Pos Pred Value         0.6567   0.6292   0.5500
## Neg Pred Value         0.9053   0.7897   0.7578
## Precision              0.6567   0.6292   0.5500
## Recall                 0.8462   0.5545   0.4490
## F1                     0.7395   0.5895   0.4944
## Prevalence             0.3432   0.3333   0.3234
## Detection Rate         0.2904   0.1848   0.1452
## Detection Prevalence   0.4422   0.2937   0.2640
## Balanced Accuracy      0.8075   0.6955   0.6367
# Evaluate the gbm model on the held-out test split
validateAndPrintResult(modelList$gbm, test.data)
## Stochastic Gradient Boosting 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  logLoss    AUC        prAUC    
##   0.01       1                   5              0.7970860  0.8017350  0.6379452
##   0.01       1                  10              0.7967029  0.8019479  0.6363702
##   0.01       1                  15              0.7969933  0.8014128  0.6363660
##   0.01       3                   5              0.7778815  0.8084940  0.6454769
##   0.01       3                  10              0.7776951  0.8091191  0.6466037
##   0.01       3                  15              0.7775230  0.8090222  0.6469494
##   0.01       5                   5              0.7727954  0.8105842  0.6476454
##   0.01       5                  10              0.7726221  0.8103953  0.6485085
##   0.01       5                  15              0.7724906  0.8112027  0.6487959
##   0.10       1                   5              0.7831853  0.8038137  0.6403604
##   0.10       1                  10              0.7859896  0.8024619  0.6376919
##   0.10       1                  15              0.7820925  0.8050564  0.6400199
##   0.10       3                   5              0.8173556  0.8012995  0.6469857
##   0.10       3                  10              0.8193106  0.7987970  0.6393842
##   0.10       3                  15              0.8158653  0.8020714  0.6413022
##   0.10       5                   5              0.8718596  0.7963071  0.6367853
##   0.10       5                  10              0.8622812  0.7993938  0.6429804
##   0.10       5                  15              0.8605530  0.7967428  0.6380514
##   0.30       1                   5              0.8141373  0.7934245  0.6306437
##   0.30       1                  10              0.8093852  0.7971067  0.6354846
##   0.30       1                  15              0.8137835  0.7955163  0.6318490
##   0.30       3                   5              1.0328940  0.7802969  0.6205405
##   0.30       3                  10              1.0414531  0.7770668  0.6178356
##   0.30       3                  15              1.0196153  0.7769416  0.6153279
##   0.30       5                   5              1.2078892  0.7809344  0.6235276
##   0.30       5                  10              1.2320716  0.7711913  0.6087259
##   0.30       5                  15              1.2184465  0.7680801  0.6028812
##   Accuracy   Kappa      Mean_F1    Mean_Sensitivity  Mean_Specificity
##   0.6315139  0.4451687  0.6056652  0.6252168         0.8155071       
##   0.6268898  0.4381002  0.5990852  0.6204141         0.8131440       
##   0.6297327  0.4424632  0.6035038  0.6233753         0.8146086       
##   0.6450073  0.4662700  0.6299346  0.6399955         0.8226752       
##   0.6418090  0.4614145  0.6259860  0.6367284         0.8210489       
##   0.6364758  0.4534161  0.6205193  0.6313559         0.8184055       
##   0.6485609  0.4717146  0.6349474  0.6437288         0.8244979       
##   0.6425126  0.4625664  0.6279106  0.6375671         0.8214506       
##   0.6407349  0.4599161  0.6261444  0.6358584         0.8205658       
##   0.6343348  0.4504137  0.6214377  0.6295675         0.8174407       
##   0.6304310  0.4444873  0.6170718  0.6255908         0.8154557       
##   0.6307855  0.4449879  0.6169925  0.6258732         0.8156174       
##   0.6204834  0.4298616  0.6116397  0.6161000         0.8106598       
##   0.6197691  0.4286818  0.6089492  0.6152697         0.8102324       
##   0.6176378  0.4255104  0.6073076  0.6131235         0.8091903       
##   0.6069751  0.4097143  0.5993592  0.6027814         0.8039770       
##   0.6208370  0.4305453  0.6137082  0.6168690         0.8108651       
##   0.6105190  0.4150676  0.6029256  0.6063820         0.8057571       
##   0.6133734  0.4189726  0.6014018  0.6086365         0.8069941       
##   0.6208308  0.4302212  0.6095685  0.6161740         0.8107442       
##   0.6176370  0.4253446  0.6054912  0.6128929         0.8091082       
##   0.5927498  0.3882413  0.5844321  0.5885879         0.7967642       
##   0.5952542  0.3922162  0.5885796  0.5912674         0.7981450       
##   0.6005803  0.4002080  0.5939768  0.5966115         0.8008033       
##   0.6083877  0.4118537  0.6009379  0.6044192         0.8046471       
##   0.5838745  0.3754205  0.5794714  0.5801981         0.7926145       
##   0.5835271  0.3746698  0.5773306  0.5794632         0.7923528       
##   Mean_Pos_Pred_Value  Mean_Neg_Pred_Value  Mean_Precision  Mean_Recall
##   0.6103709            0.8267023            0.6103709       0.6252168  
##   0.6037873            0.8250766            0.6037873       0.6204141  
##   0.6079959            0.8259423            0.6079959       0.6233753  
##   0.6321172            0.8291784            0.6321172       0.6399955  
##   0.6285924            0.8279356            0.6285924       0.6367284  
##   0.6216607            0.8251693            0.6216607       0.6313559  
##   0.6374184            0.8303460            0.6374184       0.6437288  
##   0.6295434            0.8276807            0.6295434       0.6375671  
##   0.6278351            0.8267441            0.6278351       0.6358584  
##   0.6222568            0.8226338            0.6222568       0.6295675  
##   0.6176608            0.8208076            0.6176608       0.6255908  
##   0.6173933            0.8211245            0.6173933       0.6258732  
##   0.6106109            0.8133363            0.6106109       0.6161000  
##   0.6074786            0.8139173            0.6074786       0.6152697  
##   0.6060231            0.8126394            0.6060231       0.6131235  
##   0.5980222            0.8059655            0.5980222       0.6027814  
##   0.6126897            0.8127827            0.6126897       0.6168690  
##   0.6014219            0.8077581            0.6014219       0.6063820  
##   0.5997755            0.8112085            0.5997755       0.6086365  
##   0.6086933            0.8146523            0.6086933       0.6161740  
##   0.6046140            0.8135697            0.6046140       0.6128929  
##   0.5822637            0.7991150            0.5822637       0.5885879  
##   0.5873584            0.7996838            0.5873584       0.5912674  
##   0.5926529            0.8023183            0.5926529       0.5966115  
##   0.5992886            0.8066503            0.5992886       0.6044192  
##   0.5792217            0.7930418            0.5792217       0.5801981  
##   0.5767630            0.7935956            0.5767630       0.5794632  
##   Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.2105046            0.7203620             
##   0.2089633            0.7167790             
##   0.2099109            0.7189920             
##   0.2150024            0.7313353             
##   0.2139363            0.7288886             
##   0.2121586            0.7248807             
##   0.2161870            0.7341133             
##   0.2141709            0.7295088             
##   0.2135783            0.7282121             
##   0.2114449            0.7235041             
##   0.2101437            0.7205232             
##   0.2102618            0.7207453             
##   0.2068278            0.7133799             
##   0.2065897            0.7127511             
##   0.2058793            0.7111569             
##   0.2023250            0.7033792             
##   0.2069457            0.7138670             
##   0.2035063            0.7060696             
##   0.2044578            0.7078153             
##   0.2069436            0.7134591             
##   0.2058790            0.7110005             
##   0.1975833            0.6926761             
##   0.1984181            0.6947062             
##   0.2001934            0.6987074             
##   0.2027959            0.7045331             
##   0.1946248            0.6864063             
##   0.1945090            0.6859080             
## 
## Tuning parameter 'n.trees' was held constant at a value of 400
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
##  5, shrinkage = 0.01 and n.minobsinnode = 15.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.583696492574584
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   H   L   M
##          H 303  13  69
##          L  30 321 162
##          M  80  71 154
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6467         
##                  95% CI : (0.619, 0.6738)
##     No Information Rate : 0.3433         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4691         
##                                          
##  Mcnemar's Test P-Value : 2.374e-09      
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.7337   0.7926   0.4000
## Specificity            0.8962   0.7594   0.8154
## Pos Pred Value         0.7870   0.6257   0.5049
## Neg Pred Value         0.8655   0.8783   0.7428
## Precision              0.7870   0.6257   0.5049
## Recall                 0.7337   0.7926   0.4000
## F1                     0.7594   0.6993   0.4464
## Prevalence             0.3433   0.3367   0.3200
## Detection Rate         0.2519   0.2668   0.1280
## Detection Prevalence   0.3200   0.4264   0.2535
## Balanced Accuracy      0.8149   0.7760   0.6077
# Repeat the same GBM evaluation, this time against the separate
# validation split (valid.cl.data) rather than the test split.
validateAndPrintResult(modelList$gbm, valid.cl.data)
## Stochastic Gradient Boosting 
## 
## 2814 samples
##   19 predictor
##    3 classes: 'H', 'L', 'M' 
## 
## No pre-processing
## Resampling: Cross-Validated (7 fold) 
## Summary of sample sizes: 2412, 2412, 2412, 2411, 2413, 2412, ... 
## Resampling results across tuning parameters:
## 
##   shrinkage  interaction.depth  n.minobsinnode  logLoss    AUC        prAUC    
##   0.01       1                   5              0.7970860  0.8017350  0.6379452
##   0.01       1                  10              0.7967029  0.8019479  0.6363702
##   0.01       1                  15              0.7969933  0.8014128  0.6363660
##   0.01       3                   5              0.7778815  0.8084940  0.6454769
##   0.01       3                  10              0.7776951  0.8091191  0.6466037
##   0.01       3                  15              0.7775230  0.8090222  0.6469494
##   0.01       5                   5              0.7727954  0.8105842  0.6476454
##   0.01       5                  10              0.7726221  0.8103953  0.6485085
##   0.01       5                  15              0.7724906  0.8112027  0.6487959
##   0.10       1                   5              0.7831853  0.8038137  0.6403604
##   0.10       1                  10              0.7859896  0.8024619  0.6376919
##   0.10       1                  15              0.7820925  0.8050564  0.6400199
##   0.10       3                   5              0.8173556  0.8012995  0.6469857
##   0.10       3                  10              0.8193106  0.7987970  0.6393842
##   0.10       3                  15              0.8158653  0.8020714  0.6413022
##   0.10       5                   5              0.8718596  0.7963071  0.6367853
##   0.10       5                  10              0.8622812  0.7993938  0.6429804
##   0.10       5                  15              0.8605530  0.7967428  0.6380514
##   0.30       1                   5              0.8141373  0.7934245  0.6306437
##   0.30       1                  10              0.8093852  0.7971067  0.6354846
##   0.30       1                  15              0.8137835  0.7955163  0.6318490
##   0.30       3                   5              1.0328940  0.7802969  0.6205405
##   0.30       3                  10              1.0414531  0.7770668  0.6178356
##   0.30       3                  15              1.0196153  0.7769416  0.6153279
##   0.30       5                   5              1.2078892  0.7809344  0.6235276
##   0.30       5                  10              1.2320716  0.7711913  0.6087259
##   0.30       5                  15              1.2184465  0.7680801  0.6028812
##   Accuracy   Kappa      Mean_F1    Mean_Sensitivity  Mean_Specificity
##   0.6315139  0.4451687  0.6056652  0.6252168         0.8155071       
##   0.6268898  0.4381002  0.5990852  0.6204141         0.8131440       
##   0.6297327  0.4424632  0.6035038  0.6233753         0.8146086       
##   0.6450073  0.4662700  0.6299346  0.6399955         0.8226752       
##   0.6418090  0.4614145  0.6259860  0.6367284         0.8210489       
##   0.6364758  0.4534161  0.6205193  0.6313559         0.8184055       
##   0.6485609  0.4717146  0.6349474  0.6437288         0.8244979       
##   0.6425126  0.4625664  0.6279106  0.6375671         0.8214506       
##   0.6407349  0.4599161  0.6261444  0.6358584         0.8205658       
##   0.6343348  0.4504137  0.6214377  0.6295675         0.8174407       
##   0.6304310  0.4444873  0.6170718  0.6255908         0.8154557       
##   0.6307855  0.4449879  0.6169925  0.6258732         0.8156174       
##   0.6204834  0.4298616  0.6116397  0.6161000         0.8106598       
##   0.6197691  0.4286818  0.6089492  0.6152697         0.8102324       
##   0.6176378  0.4255104  0.6073076  0.6131235         0.8091903       
##   0.6069751  0.4097143  0.5993592  0.6027814         0.8039770       
##   0.6208370  0.4305453  0.6137082  0.6168690         0.8108651       
##   0.6105190  0.4150676  0.6029256  0.6063820         0.8057571       
##   0.6133734  0.4189726  0.6014018  0.6086365         0.8069941       
##   0.6208308  0.4302212  0.6095685  0.6161740         0.8107442       
##   0.6176370  0.4253446  0.6054912  0.6128929         0.8091082       
##   0.5927498  0.3882413  0.5844321  0.5885879         0.7967642       
##   0.5952542  0.3922162  0.5885796  0.5912674         0.7981450       
##   0.6005803  0.4002080  0.5939768  0.5966115         0.8008033       
##   0.6083877  0.4118537  0.6009379  0.6044192         0.8046471       
##   0.5838745  0.3754205  0.5794714  0.5801981         0.7926145       
##   0.5835271  0.3746698  0.5773306  0.5794632         0.7923528       
##   Mean_Pos_Pred_Value  Mean_Neg_Pred_Value  Mean_Precision  Mean_Recall
##   0.6103709            0.8267023            0.6103709       0.6252168  
##   0.6037873            0.8250766            0.6037873       0.6204141  
##   0.6079959            0.8259423            0.6079959       0.6233753  
##   0.6321172            0.8291784            0.6321172       0.6399955  
##   0.6285924            0.8279356            0.6285924       0.6367284  
##   0.6216607            0.8251693            0.6216607       0.6313559  
##   0.6374184            0.8303460            0.6374184       0.6437288  
##   0.6295434            0.8276807            0.6295434       0.6375671  
##   0.6278351            0.8267441            0.6278351       0.6358584  
##   0.6222568            0.8226338            0.6222568       0.6295675  
##   0.6176608            0.8208076            0.6176608       0.6255908  
##   0.6173933            0.8211245            0.6173933       0.6258732  
##   0.6106109            0.8133363            0.6106109       0.6161000  
##   0.6074786            0.8139173            0.6074786       0.6152697  
##   0.6060231            0.8126394            0.6060231       0.6131235  
##   0.5980222            0.8059655            0.5980222       0.6027814  
##   0.6126897            0.8127827            0.6126897       0.6168690  
##   0.6014219            0.8077581            0.6014219       0.6063820  
##   0.5997755            0.8112085            0.5997755       0.6086365  
##   0.6086933            0.8146523            0.6086933       0.6161740  
##   0.6046140            0.8135697            0.6046140       0.6128929  
##   0.5822637            0.7991150            0.5822637       0.5885879  
##   0.5873584            0.7996838            0.5873584       0.5912674  
##   0.5926529            0.8023183            0.5926529       0.5966115  
##   0.5992886            0.8066503            0.5992886       0.6044192  
##   0.5792217            0.7930418            0.5792217       0.5801981  
##   0.5767630            0.7935956            0.5767630       0.5794632  
##   Mean_Detection_Rate  Mean_Balanced_Accuracy
##   0.2105046            0.7203620             
##   0.2089633            0.7167790             
##   0.2099109            0.7189920             
##   0.2150024            0.7313353             
##   0.2139363            0.7288886             
##   0.2121586            0.7248807             
##   0.2161870            0.7341133             
##   0.2141709            0.7295088             
##   0.2135783            0.7282121             
##   0.2114449            0.7235041             
##   0.2101437            0.7205232             
##   0.2102618            0.7207453             
##   0.2068278            0.7133799             
##   0.2065897            0.7127511             
##   0.2058793            0.7111569             
##   0.2023250            0.7033792             
##   0.2069457            0.7138670             
##   0.2035063            0.7060696             
##   0.2044578            0.7078153             
##   0.2069436            0.7134591             
##   0.2058790            0.7110005             
##   0.1975833            0.6926761             
##   0.1984181            0.6947062             
##   0.2001934            0.6987074             
##   0.2027959            0.7045331             
##   0.1946248            0.6864063             
##   0.1945090            0.6859080             
## 
## Tuning parameter 'n.trees' was held constant at a value of 400
## logLoss was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 400, interaction.depth =
##  5, shrinkage = 0.01 and n.minobsinnode = 15.
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 2814
## Number of groups: 1
## Observations per group: 2814
## Positive: L
## Negative: H
## Group: Group 1
## Positive: 947
## Negative: 966
## ***Performance Metrics***

## Group 1 Optimal Informedness = 0.583696492574584
## Group 1 AUC-ROC = 0.85

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  H  L  M
##          H 97 24 26
##          L  0 64 30
##          M  7 13 42
## 
## Overall Statistics
##                                           
##                Accuracy : 0.67            
##                  95% CI : (0.6139, 0.7227)
##     No Information Rate : 0.3432          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5029          
##                                           
##  Mcnemar's Test P-Value : 4.736e-09       
## 
## Statistics by Class:
## 
##                      Class: H Class: L Class: M
## Sensitivity            0.9327   0.6337   0.4286
## Specificity            0.7487   0.8515   0.9024
## Pos Pred Value         0.6599   0.6809   0.6774
## Neg Pred Value         0.9551   0.8230   0.7676
## Precision              0.6599   0.6809   0.6774
## Recall                 0.9327   0.6337   0.4286
## F1                     0.7729   0.6564   0.5250
## Prevalence             0.3432   0.3333   0.3234
## Detection Rate         0.3201   0.2112   0.1386
## Detection Prevalence   0.4851   0.3102   0.2046
## Balanced Accuracy      0.8407   0.7426   0.6655
# Work on a copy of the cleaned classification data for the H2O pipeline.
h2o.data <- class.data

# Fix the RNG state so the 75/25 split below is reproducible.
set.seed(100)

# The response must be a factor so the partition is stratified by class.
h2o.data[["RISK"]] <- as.factor(h2o.data[["RISK"]])

# Stratified sample of row indices for training (75%), via caret.
trainRowNumbers.cl <- createDataPartition(
  h2o.data[["RISK"]],
  p = 0.75,
  list = FALSE
)

# Split into the training rows and the remaining 25% for testing.
train.data <- h2o.data[trainRowNumbers.cl, ]
test.data  <- h2o.data[-trainRowNumbers.cl, ]

# Upload the training split to the H2O cluster as an H2OFrame
# (the progress bar below is printed while the frame is parsed).
train.data <- as.h2o(train.data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Upload the test split to the H2O cluster as an H2OFrame.
test.data <- as.h2o(test.data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Response column name; every other column is used as a predictor.
y <- "RISK"
x <- setdiff(names(h2o.data), y)

# H2O expects a factor (enum) response for classification.
# NOTE(review): RISK has three levels (H/L/M), so this is a multinomial
# problem, not binary classification.
train.data[, y] <- as.factor(train.data[, y])
test.data[, y] <- as.factor(test.data[, y])

# Number of cross-validation folds (generates level-one data for stacking).
nfolds <- 5
# # 2. Generate a random grid of models and stack them together
# 
# # Some XGboost/GBM /rf hyperparameters
# hyper_params <- list(
#   ntrees = seq(10, 1000, 1),
#   learn_rate = seq(0.0001, 0.2, 0.0001),
#   max_depth = seq(1, 20, 1),
#   sample_rate = seq(0.5, 1.0, 0.0001),
#   col_sample_rate = seq(0.2, 1.0, 0.0001)
# )
# 
# search_criteria <- list(strategy = "RandomDiscrete",
#                         max_models = 10)
# 
# grid.id <-  as.character(format(Sys.time(), "%S"))
# 
# 
# # Train & Cross-validate a RF
# rf_grid <- h2o.grid(
#   algorithm = "drf",
#   grid_id = paste0("grid_binomial_rf_", grid.id),
#   x = x,
#   y = y,
#   training_frame = train.data,
#   seed = 100,
#   nfolds = nfolds,
#   ntrees = 2500,
#   fold_assignment = "Modulo",
#   keep_cross_validation_predictions = TRUE
# )
# 
# 
# gbm_grid <- h2o.grid(
#   algorithm = "gbm",
#   grid_id = paste0("grid_binomial_gbm_", grid.id),
#   x = x,
#   y = y,
#   training_frame = train.data,
#   # ntrees = seq(10, 1000, 1),
#   seed = 100,
#   nfolds = nfolds,
#   fold_assignment = "Modulo",
#   keep_cross_validation_predictions = TRUE,
#   hyper_params = hyper_params,
#   search_criteria = search_criteria
# )
# 
# 
# 
# # Train the grid
# xgb_grid <- h2o.grid(
#   algorithm = "xgboost",
#   grid_id = paste0("grid_binomial_xgb_", grid.id),
#   x = x,
#   y = y,
#   training_frame = train.data,
#   nfolds = nfolds,
#   seed = 100,
#   fold_assignment = "Modulo",
#   keep_cross_validation_predictions = TRUE,
#   hyper_params = hyper_params,
#   search_criteria = search_criteria
# )
# 
# # Train a stacked ensemble using the H2O and XGBoost models from above
# base.models <- append(gbm_grid@model_ids,
#                       xgb_grid@model_ids)
# 
# # Train a stacked ensemble using the GBM grid
# ensemble <- h2o.stackedEnsemble(
#   x = x,
#   y = y,
#   model_id = paste0("ensemble_gbm_grid_", grid.id, "_24"),
#   training_frame = train.data,
#   base_models = base.models
# )
# 
# # Eval ensemble performance on a test set
# perf <- h2o.performance(ensemble, newdata = test.data)
# 
# # Compare to base learner performance on the test set
# .getmean_per_class_error <-
#   function(mm)
#     h2o.mean_per_class_error(h2o.performance(h2o.getModel(mm), newdata = test.data))
# 
# baselearner_aucs <- sapply(base.models, .getmean_per_class_error)
# baselearner_best_auc_test <- max(baselearner_aucs)
# ensemble_auc_test <- h2o.mean_per_class_error(perf)
# print(sprintf("Best Base-learner Test Mean per class error:  %s", baselearner_best_auc_test))
# print(sprintf("Ensemble Test Mean per class error:  %s", ensemble_auc_test))
# 
# # Generate predictions on a test set (if neccessary)
# pred <- h2o.predict(ensemble, newdata = test.data)
# 
# # Sort the grid by CV AUC for GBM
# get_gbm_grid <- h2o.getGrid(grid_id = gbm_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_gbm_grid
# gbm_grid_top_model <- get_gbm_grid@summary_table[1, "model_ids"]
# gbm_grid_top_model
# 
# # Sort the grid by CV AUC for XGBOOST
# get_xgb_grid <- h2o.getGrid(grid_id = xgb_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_xgb_grid
# xgb_grid_top_model <- get_xgb_grid@summary_table[1, "model_ids"]
# xgb_grid_top_model
# 
# # Sort the grid by CV AUC for XGBOOST
# get_rf_grid <- h2o.getGrid(grid_id = rf_grid@grid_id, sort_by = "mean_per_class_error", decreasing = TRUE)
# get_rf_grid
# rf_grid_top_model <- get_rf_grid@summary_table[1, "model_ids"]
# rf_grid_top_model
# Use AutoML to find a list of candidate models (i.e., leaderboard)
# Use H2O AutoML to train a set of candidate models on the training frame
# and rank them on a leaderboard. Both leaderboard sorting and early
# stopping are driven by mean per-class error, which suits the
# three-class RISK response.
auto_ml <- h2o.automl(
  x = x,
  y = y,
  training_frame = train.data,
  # Reuse the fold count declared above rather than a second hard-coded 5,
  # so changing nfolds in one place affects the whole script.
  nfolds = nfolds,
  max_runtime_secs = 60 * 120,  # cap the whole AutoML run at 2 hours
  max_models = 10,              # base models (stacked ensembles excluded)
  keep_cross_validation_predictions = FALSE,
  sort_metric = "mean_per_class_error",
  seed = 123,
  stopping_rounds = 50,
  stopping_metric = "mean_per_class_error",
  # NOTE(review): a tolerance of 0 is below H2O's recommended default and
  # triggers the "models may not converge" warning seen in the captured
  # log; keep only if the strict stopping criterion is intentional.
  stopping_tolerance = 0
)
## 
  |                                                                            
  |                                                                      |   0%
## 23:28:58.300: Stopping tolerance set by the user is < 70% of the recommended default of 0.018214966464911487, so models may take a long time to converge or may not converge at all.
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |======================================================================| 100%
# Print the AutoML leaderboard (models ranked by mean per-class error,
# the sort_metric requested above).
auto_ml@leaderboard
##                                              model_id mean_per_class_error
## 1                    XGBoost_3_AutoML_20200226_232858            0.3621127
## 2 StackedEnsemble_BestOfFamily_AutoML_20200226_232858            0.3708717
## 3                        DRF_1_AutoML_20200226_232858            0.3725618
## 4    StackedEnsemble_AllModels_AutoML_20200226_232858            0.3736785
## 5                    XGBoost_1_AutoML_20200226_232858            0.3788496
## 6                    XGBoost_2_AutoML_20200226_232858            0.3789230
##     logloss      rmse       mse
## 1 0.7721607 0.5299974 0.2808972
## 2 0.7720142 0.5239900 0.2745655
## 3 0.8395249 0.5176616 0.2679736
## 4 0.7715062 0.5236370 0.2741957
## 5 0.7714698 0.5284186 0.2792263
## 6 0.7735021 0.5283216 0.2791237
## 
## [12 rows x 5 columns]
# Inspect the leaderboard as a plain data frame: keep only the model id
# and the ranking metric, limited to the first 25 rows. The single best
# model is also available directly via auto_ml@leader.
leaderboard.df <- as.data.frame(auto_ml@leaderboard)
leaderboard.df %>%
  dplyr::select(model_id, mean_per_class_error) %>%
  dplyr::slice(1:25)
##                                               model_id mean_per_class_error
## 1                     XGBoost_3_AutoML_20200226_232858            0.3621127
## 2  StackedEnsemble_BestOfFamily_AutoML_20200226_232858            0.3708717
## 3                         DRF_1_AutoML_20200226_232858            0.3725618
## 4     StackedEnsemble_AllModels_AutoML_20200226_232858            0.3736785
## 5                     XGBoost_1_AutoML_20200226_232858            0.3788496
## 6                     XGBoost_2_AutoML_20200226_232858            0.3789230
## 7                         GBM_1_AutoML_20200226_232858            0.3903032
## 8                         GBM_2_AutoML_20200226_232858            0.4012999
## 9                         GBM_5_AutoML_20200226_232858            0.4040333
## 10                        GBM_4_AutoML_20200226_232858            0.4053515
## 11                        GBM_3_AutoML_20200226_232858            0.4058087
## 12                        GLM_1_AutoML_20200226_232858            0.4239250